package com.frogchou.content.getweibo;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Iterator;

import org.eclipse.jetty.io.BuffersFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.frogchou.content.getcontent.WeiboCN;
import com.frogchou.utils.FccUtils;

/**
 * Crawls Sina Weibo with WebCollector, authenticating via an acquired login
 * cookie, and extracts/saves the page data.
 * @author hu
 */
public class WeiboCrawler extends BreadthCrawler {

    /** Characters invalid in Windows file names, stripped from page titles. */
    private static final String ILLEGAL_FILENAME_CHARS = "[\\\\/*:|?\"<>]";

    /** Directory where fetched pages are written as .html files. */
    private static final String SAVE_PATH = "D:\\spide\\save\\html\\";

    /** Sina Weibo session cookie attached to every outgoing request. */
    String cookie;

    /**
     * Logs in to Sina Weibo once and keeps the session cookie for all requests.
     *
     * @param crawlPath directory used by WebCollector to persist crawl state
     * @param autoParse whether WebCollector should auto-extract links
     * @throws Exception if the Weibo login fails
     */
    public WeiboCrawler(String crawlPath, boolean autoParse) throws Exception {
        super(crawlPath, autoParse);
        // SECURITY: credentials travel in plain text; use a throwaway account.
        // The historical hard-coded values remain as defaults, but can now be
        // overridden with -Dweibo.user=... -Dweibo.password=... at launch.
        cookie = WeiboCN.getSinaCookie(
                System.getProperty("weibo.user", "frogchou@126.com"),
                System.getProperty("weibo.password", "frog419X"));
    }

    /** Attaches the login cookie to every request the crawler sends. */
    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        request.setCookie(cookie);
        return request.getResponse();
    }

    /**
     * Queues every absolute link found on the page, then writes the page's
     * HTML to {@code SAVE_PATH}, naming the file after the sanitized page
     * title plus a random suffix to avoid collisions.
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        // Follow every link on the page; the crawler's regex rules filter them.
        Elements links = page.doc().select("a[href]");
        for (Element link : links) {
            next.add(link.attr("abs:href"));
        }

        // Strip characters that Windows does not allow in file names.
        String title = page.doc().title().replaceAll(ILLEGAL_FILENAME_CHARS, "");

        File dir = new File(SAVE_PATH);
        // Bail out early if the save directory cannot be created, instead of
        // failing later with a confusing FileNotFoundException.
        if (!dir.exists() && !dir.mkdirs()) {
            System.err.println("Could not create save directory: " + SAVE_PATH);
            return;
        }

        File saveFile = new File(dir, title + FccUtils.getRandomNum() + ".html");
        // try-with-resources closes the writer (and the underlying stream) even
        // on error; FileOutputStream creates the file, so createNewFile() is
        // unnecessary. FileNotFoundException is a subclass of IOException.
        try (OutputStreamWriter osw =
                new OutputStreamWriter(new FileOutputStream(saveFile), "UTF-8")) {
            osw.write(page.getHtml());
        } catch (IOException e) {
            System.err.println("Failed to save " + saveFile + ": " + e);
        }
    }

    public static void main(String[] args) throws Exception {
        WeiboCrawler crawler = new WeiboCrawler("weibo_crawler", false);
        crawler.setThreads(15);
        // Seed with the Weibo mobile home page and restrict the crawl to weibo.cn.
        crawler.addSeed(new CrawlDatum("http://weibo.cn/"));
        crawler.addRegex("http://weibo.cn/.*");
        crawler.start(8);
    }
}